{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "21528ae2-45bb-4074-95e5-286b2aa30216",
      "metadata": {
        "deletable": false,
        "editable": true,
        "id": "21528ae2-45bb-4074-95e5-286b2aa30216",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "%session_id_prefix iceberg-dataframe-\n",
        "%glue_version 3.0\n",
        "%idle_timeout 60\n",
        "%connections <your-iceberg-connection-name>\n",
        "%%configure \n",
        "{\n",
        "  \"--conf\": \"spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\"\n",
        "}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9c175ad7-f103-47ec-9b81-6b045cc0bd40",
      "metadata": {
        "id": "9c175ad7-f103-47ec-9b81-6b045cc0bd40",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "catalog_name = \"glue_catalog\"\n",
        "bucket_name = \"<Your S3 bucket name>\"\n",
        "bucket_prefix = \"<Your S3 bucket prefix>\"\n",
        "database_name = \"iceberg_dataframe\"\n",
        "table_name = \"product\"\n",
        "warehouse_path = f\"s3://{bucket_name}/{bucket_prefix}\"\n",
        "dynamodb_table = 'myGlueLockTable'"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "c91cdff5",
      "metadata": {},
      "source": [
        "## Initialize SparkSession"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d4819a61-8d81-4cec-80b2-fd6e3d029969",
      "metadata": {
        "id": "d4819a61-8d81-4cec-80b2-fd6e3d029969",
        "outputId": "953601c3-824a-4f9b-90f3-e692c776a7e6",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "from pyspark.sql import SparkSession\n",
        "spark = SparkSession.builder \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}\", \"org.apache.iceberg.spark.SparkCatalog\") \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}.warehouse\", f\"{warehouse_path}\") \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}.catalog-impl\", \"org.apache.iceberg.aws.glue.GlueCatalog\") \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}.io-impl\", \"org.apache.iceberg.aws.s3.S3FileIO\") \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}.lock-impl\", \"org.apache.iceberg.aws.glue.DynamoLockManager\") \\\n",
        "    .config(f\"spark.sql.catalog.{catalog_name}.lock.table\", f\"{dynamodb_table}\") \\\n",
        "    .config(\"spark.sql.extensions\",\"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\") \\\n",
        "    .getOrCreate()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "19d274a4-1610-43a6-a574-b18894a2f73b",
      "metadata": {
        "id": "19d274a4-1610-43a6-a574-b18894a2f73b"
      },
      "source": [
        "## Clean up existing resources"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9ddef3b9-2b9f-42a8-a69b-ce4ecf637e49",
      "metadata": {
        "id": "9ddef3b9-2b9f-42a8-a69b-ce4ecf637e49",
        "outputId": "4b7cf09e-75fe-4d07-9c94-952942e99cc1",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "query = f\"\"\"\n",
        "DROP TABLE IF EXISTS {catalog_name}.{database_name}.{table_name}\n",
        "\"\"\"\n",
        "spark.sql(query)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "5078b110-be78-43fe-8ffe-93cd013c7b0c",
      "metadata": {
        "id": "5078b110-be78-43fe-8ffe-93cd013c7b0c"
      },
      "source": [
        "## Create Iceberg table with sample data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90",
      "metadata": {
        "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90",
        "outputId": "be3ab456-697d-4644-ef4a-cf0ebdc1679e",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "query = f\"\"\"\n",
        "CREATE DATABASE IF NOT EXISTS {database_name}\n",
        "\"\"\"\n",
        "spark.sql(query)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90",
      "metadata": {
        "id": "fba92930-daf3-4b20-ae73-25b0b6c82d90",
        "outputId": "be3ab456-697d-4644-ef4a-cf0ebdc1679e",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "from pyspark.sql import Row\n",
        "import time\n",
        "\n",
        "ut = time.time()\n",
        "\n",
        "df_products = spark.createDataFrame(\n",
        "    [\n",
        "        (\"00001\", \"Heater\", 250, \"Electronics\", ut),\n",
        "        (\"00002\", \"Thermostat\", 400, \"Electronics\", ut),\n",
        "        (\"00003\", \"Television\", 600, \"Electronics\", ut),\n",
        "        (\"00004\", \"Blender\", 100, \"Electronics\", ut),\n",
        "        (\"00005\", \"Table\", 150, \"Furniture\", ut)\n",
        "    ],\n",
        "    [\"product_id\", \"product_name\", \"price\", \"category\", \"updated_at\"],\n",
        ")\n",
        "\n",
        "df_products.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c08925d5-cdfb-4155-b337-472fe24f55d1",
      "metadata": {
        "id": "c08925d5-cdfb-4155-b337-472fe24f55d1",
        "outputId": "6cbc723e-539a-4293-8ef3-5e1411a3fadf",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "df_products.sortWithinPartitions(\"category\") \\\n",
        "    .writeTo(f\"{catalog_name}.{database_name}.{table_name}\") \\\n",
        "    .create()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "60a6395f-d63b-479e-b00e-3aa92e6a3210",
      "metadata": {
        "id": "60a6395f-d63b-479e-b00e-3aa92e6a3210",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.catalog.listTables(database_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "48d42e63-fc96-4047-96fb-ad388f16a8bb",
      "metadata": {
        "id": "48d42e63-fc96-4047-96fb-ad388f16a8bb"
      },
      "source": [
        "## Read from Iceberg table"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cb18a298-7934-46c4-a187-2960a77668d3",
      "metadata": {
        "id": "cb18a298-7934-46c4-a187-2960a77668d3",
        "outputId": "22b6e30c-e369-42f4-97f5-596bf06ae8bb",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5bbf8e99-ad2e-4216-ae41-9e4e2558f51e",
      "metadata": {
        "id": "5bbf8e99-ad2e-4216-ae41-9e4e2558f51e",
        "outputId": "d32dd590-6cc9-4f06-d58c-d4c9ae7e2dba",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "edc9062b-d134-4e78-879e-d9c51e85fdad",
      "metadata": {
        "id": "edc9062b-d134-4e78-879e-d9c51e85fdad",
        "tags": []
      },
      "source": [
        "## Append records into Iceberg table"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "92624355-7e93-4c65-950c-3d0ccd0110b6",
      "metadata": {
        "id": "92624355-7e93-4c65-950c-3d0ccd0110b6",
        "outputId": "bc5f26c1-f381-460b-ca81-b8cd0b8c1864",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "ut = time.time()\n",
        "\n",
        "df_products_appends = spark.createDataFrame(\n",
        "    [\n",
        "        (\"00006\", \"Chair\", 50, \"Furniture\", ut), \n",
        "        (\"00007\", \"Desk\", 350, \"Furniture\", ut),\n",
        "    ],\n",
        "    [\"product_id\", \"product_name\", \"price\", \"category\", \"updated_at\"],\n",
        ")\n",
        "\n",
        "df_products_appends.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5f34efb1-b451-42e1-980d-8eb4b3f48e18",
      "metadata": {
        "id": "5f34efb1-b451-42e1-980d-8eb4b3f48e18",
        "outputId": "c3a91e23-5edf-422a-e4f1-2e6b899d8f11",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "df_products_appends.writeTo(f\"{catalog_name}.{database_name}.{table_name}\").append()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "dcc161d0-5ff9-4b99-b9d2-8e8faf669350",
      "metadata": {
        "id": "dcc161d0-5ff9-4b99-b9d2-8e8faf669350",
        "outputId": "b409efb4-1258-4a76-9866-711306090845",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "951e88f5-5d0a-444f-a85b-18f9736856c9",
      "metadata": {
        "id": "951e88f5-5d0a-444f-a85b-18f9736856c9",
        "outputId": "9afddf1c-acc4-473b-e998-a1058078710b",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6dbba46c-ace4-4388-b17e-c3d12d50756c",
      "metadata": {
        "id": "6dbba46c-ace4-4388-b17e-c3d12d50756c"
      },
      "source": [
        "## Overwrite records into Iceberg table"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "555aba1f-8abb-4b28-9b93-93ed6937410a",
      "metadata": {
        "id": "555aba1f-8abb-4b28-9b93-93ed6937410a",
        "outputId": "e2fbfa24-a90f-42fb-9f0d-a3a18b61b140",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "df_products = spark.table(f\"{catalog_name}.{database_name}.{table_name}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "018c1327-744f-4055-8d4f-cc666e67be03",
      "metadata": {
        "id": "018c1327-744f-4055-8d4f-cc666e67be03",
        "outputId": "75bbac08-f8cf-43b7-ceee-21e70e7cb355",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "from pyspark.sql.functions import lit\n",
        "\n",
        "ut = time.time()\n",
        "\n",
        "df_products_overwrites = df_products \\\n",
        "    .withColumn(\"price\", df_products.price*1.5) \\\n",
        "    .withColumn(\"updated_at\", lit(ut))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "61ec5a97-f8e9-44fd-b2b8-5ddfdbcf0eac",
      "metadata": {
        "id": "61ec5a97-f8e9-44fd-b2b8-5ddfdbcf0eac",
        "outputId": "a30b5ad2-2ad7-4c94-bd04-cb5fee9088df",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "df_products_overwrites.writeTo(f\"{catalog_name}.{database_name}.{table_name}\").overwritePartitions()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "014ce305-aaaa-41f3-ab3f-02bd8508f63c",
      "metadata": {
        "id": "014ce305-aaaa-41f3-ab3f-02bd8508f63c",
        "outputId": "b2f3806a-8674-4636-a47a-3d5d7672c0f3",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5bae8d97-76f3-4b4c-b863-0ffb87ce4ce8",
      "metadata": {
        "id": "5bae8d97-76f3-4b4c-b863-0ffb87ce4ce8",
        "outputId": "e9560b49-5524-47ea-d276-d4f07f64146c",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "spark.table(f\"{catalog_name}.{database_name}.{table_name}.history\") \\\n",
        "    .show()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6e114b41",
      "metadata": {},
      "source": [
        "## Stop Session"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "539c31ae-9c97-46da-8bb6-cdface9758a4",
      "metadata": {
        "id": "539c31ae-9c97-46da-8bb6-cdface9758a4",
        "outputId": "7e048cbf-f37a-45d2-87f0-221c3f61a529",
        "trusted": true
      },
      "outputs": [],
      "source": [
        "%stop_session"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "iceberg_df.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Glue PySpark",
      "language": "python",
      "name": "glue_pyspark"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
